This notebook explores the use of SingleR to perform cell-type annotation on datasets from the ScPCA project SCPCP000007 (Gawad lab data).

Note: The first time you run this code it may take a few more minutes due to reference downloads, but they will be cached for faster future execution.

0.1 Set Up

# load the R project
project_root <- here::here()
renv::load(project_root)
* Project '~/ALSF/scpca/sc-data-integration' loaded. [renv 0.15.5]
suppressPackageStartupMessages({
  library(SingleCellExperiment)
  library(SingleR)
  library(celldex)
  library(ggplot2)
})
theme_set(theme_bw())
set.seed(params$seed) # unclear if this is doing anything? probably not. but maybe later!

utils_dir <- file.path(project_root, "scripts", "utils")
source(file.path(utils_dir, "integration-helpers.R"))

sce_file_suffix <- "processed_citeseq.rds"

integrated_sce_file <- file.path(params$integrated_sce_dir, paste0(
  params$project_id, "_integrated_", params$integration_method, "_sce.rds"
))
if (!file.exists(integrated_sce_file)){
  stop("Integrated SCE file could not be found.")
}

Read in the data:

library_metadata_df <- readr::read_tsv(params$library_file)
Rows: 25 Columns: 16── Column specification ─────────────────────────────────────────────────────────────────────────
Delimiter: "\t"
chr (15): project_name, submitter, library_biomaterial_id, sample_biomaterial_id, diagnosis, ...
lgl  (1): has_CITEseq
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
integrated_sce <- readr::read_rds(integrated_sce_file)

# Define the unintegrated SCE filenames
sce_file_paths <- library_metadata_df %>%
  dplyr::filter(project_name == params$project_id) %>%
  dplyr::mutate(sce_file_path = file.path(
    integration_input_dir, sample_biomaterial_id, glue::glue("{library_biomaterial_id}_{sce_file_suffix}")
  )) %>%
  dplyr::pull(sce_file_path)

0.2 SingleR annotation

Here we performn celltype annotation with the given reference in params$reference on each of the Gawad libraries and look at their UMAPs colored by celltype.

  umap_df <- tibble::as_tibble(reducedDim(annotation_output[["sce"]], "UMAP")) %>%
    dplyr::select(UMAP1 = V1, UMAP2 = V2) %>%
    dplyr::mutate(celltypes = colData(annotation_output[["sce"]])[,colname])
Warning: The `x` argument of `as_tibble.matrix()` must have unique column names if `.name_repair` is omitted as of tibble 2.0.0.
Using compatibility `.name_repair`.

Here we go!

# Read in all SCE files
sce_list <- purrr::map(
  sce_file_paths, 
  readr::read_rds
)

# Annotate them all, popping out some viz along the way
sce_list_annotated <- purrr::map(sce_list, run_SingleR, viz = TRUE)
[1] "SCPCL000295"
[1] "SCPCL000296"
[1] "SCPCL000297"
[1] "SCPCL000298"
[1] "SCPCL000299"

# for sanity during dev:
readr::write_rds(sce_list_annotated, "sce_list_annotated.rds")

Now let’s plot, to start, the fastMNN UMAP but with celltype annotations from individual libraries. As is deeply commented all over the place, we definitely need to infuse palettes with biology!

# helper to count the number of celltypes across all libraries
count_celltypes <- function(sce) {
 tibble::tibble(celltype = colData(sce)$SingleR_annotations ) %>%
    dplyr::count(celltype) %>%
    dplyr::arrange(-n) %>%
    tidyr::drop_na() 
}

# Celltype order based on **overall** counts for all pooled annotations 
celltype_order <- purrr::map_df(sce_list_annotated, count_celltypes) %>%
  dplyr::group_by(celltype) %>%
  dplyr::summarize(celltype_counts = sum(n)) %>%
  dplyr::arrange(-celltype_counts) %>%
  dplyr::pull(celltype)


# Again we'd eventually like some biology to go into the color choices!
cell_colors <- rainbow(length(celltype_order))



# Find all the annotations for a combined color palette:
# This function is also used later for ADT extraction
extract_annotation <- function(sce, annotation_column_name) {
 tibble::tibble(batch = unique(metadata(sce)$library),
                celltype = colData(sce)[,annotation_column_name], 
                cell_barcode = rownames(colData(sce))) %>%
    dplyr::mutate(cell_name = paste(cell_barcode, batch, sep = "-"))
}


# integrated UMAP with individual library cell annotations colored
purrr::map_df(sce_list_annotated, extract_annotation, "SingleR_annotations") %>%
  dplyr::inner_join(
    tibble::as_tibble(colData(integrated_sce), rownames = "cell_name")
  ) %>%
  dplyr::bind_cols(
    as.data.frame(reducedDim(integrated_sce, paste0(params$integration_method, "_UMAP")))
  ) %>%
  dplyr::mutate(UMAP1 = V1, UMAP2 = V2) %>%
  # And let's make a UMAP!
  ggplot() +
  aes(x = UMAP1, y = UMAP2, color = celltype) + 
  geom_point(size = 0.2, alpha = 0.3) + 
  # we definitely want some biology here!
  scale_color_manual(values = cell_colors)
Joining, by = c("batch", "cell_name")

0.3 ADTs

After annotating, we’d like to validate with ADTs.

find_max_adt <- function(sce) {
  max_adt_indices <- apply(
    logcounts(altExp(sce)),
    2,
    which.max
  )
  # associated rownames which are the ADT names
  max_adt <- rownames(altExp(sce))[max_adt_indices]
  
  # add back to sce and return
  sce$max_adt <- max_adt
  return(sce)
}


sce_list_adt <- purrr::map(sce_list_annotated, find_max_adt)


# What are the unique ADTs for each library?
purrr::map(
  lapply(sce_list_adt, `[[`, "max_adt"), 
  unique
)
[[1]]
 [1] "CD36"    "CD45"    "CD33"    "CD71"    "CD32"    "CD45RA"  "CD123"   "CD3"     "CD90"   
[10] "CD19"    "CD235ab" "CD10"    "CD7"    

[[2]]
[1] "CD33"   "CD45"   "CD36"   "CD10"   "CD123"  "CD71"   "CD366"  "CD45RA"

[[3]]
 [1] "CD45"   "CD49f"  "CD71"   "CD33"   "CD32"   "CD123"  "CD10"   "CD36"   "CD7"    "CD274" 
[11] "CD45RA" "CD90"   "CD3"   

[[4]]
 [1] "CD123"  "CD45RA" "CD71"   "CD45"   "CD36"   "CD32"   "CD19"   "CD7"    "CD49f"  "CD3"   
[11] "CD10"   "CD90"   "CD133"  "CD25"   "CD366"  "CD274" 

[[5]]
 [1] "CD45"    "CD49f"   "CD71"    "CD32"    "CD235ab" "CD90"    "CD45RA"  "CD7"     "CD3"    
[10] "CD274"   "CD123"   "CD25"    "CD366"   "CD36"    "CD10"    "CD279"  
# What are the ADT distributions for each library?
purrr::map(
  lapply(sce_list_adt, `[[`, "max_adt"), 
  table
)
[[1]]

   CD10   CD123    CD19 CD235ab     CD3    CD32    CD33    CD36    CD45  CD45RA     CD7    CD71 
      6     366       7      68       5     836    2445     292     626      15      13      19 
   CD90 
      2 

[[2]]

  CD10  CD123   CD33   CD36  CD366   CD45 CD45RA   CD71 
     2      9   2845      7      1    289     17      4 

[[3]]

  CD10  CD123  CD274    CD3   CD32   CD33   CD36   CD45 CD45RA  CD49f    CD7   CD71   CD90 
     6     10      4      1     56      4      1   3289      2    299      2     10      3 

[[4]]

  CD10  CD123  CD133   CD19   CD25  CD274    CD3   CD32   CD36  CD366   CD45 CD45RA  CD49f 
     1   1747      1     14      4      3     12     17     31      1    726   1443     22 
   CD7   CD71   CD90 
    76    143     63 

[[5]]

   CD10   CD123 CD235ab    CD25   CD274   CD279     CD3    CD32    CD36   CD366    CD45  CD45RA 
      1      18       3       4       9       4      16       5       1       1     548       1 
  CD49f     CD7    CD71    CD90 
    109      14      17      38 
# What's the most common ADT distributions for each library?
most_common <- function(a_list) {
  sort(table(a_list), decreasing = TRUE)[1] %>%
    names()
}
purrr::map(
  lapply(sce_list_adt, `[[`, "max_adt"), 
  most_common
)
[[1]]
[1] "CD33"

[[2]]
[1] "CD33"

[[3]]
[1] "CD45"

[[4]]
[1] "CD123"

[[5]]
[1] "CD45"
  • “CD33…is a transmembrane receptor expressed on cells of myeloid lineage. It is usually considered myeloid-specific, but it can also be found on some lymphoid cells.” source
  • “CD45 is a type I transmembrane protein that is present in various isoforms on all differentiated hematopoietic cells (except erythrocytes and plasma cells).” source
  • “The interleukin-3 receptor (CD123) is a molecule found on cells which helps transmit the signal of interleukin-3, a soluble cytokine important in the immune system….The receptor, found on pluripotent progenitor cells, induces tyrosine phosphorylation within the cell and promotes proliferation and differentiation within the hematopoietic cell lines. It can be found on basophils and pDCs as well as some cDCs among peripheral blood mononuclear cells….CD123 is expressed across acute myeloid leukemia (AML) subtypes, including leukemic stem cells.” source

UMAP of the ADTs:

purrr::map_df(sce_list_adt, extract_annotation, "max_adt") %>%
  dplyr::inner_join(
    tibble::as_tibble(colData(integrated_sce), rownames = "cell_name")
  ) %>%
  dplyr::bind_cols(
    as.data.frame(reducedDim(integrated_sce, paste0(params$integration_method, "_UMAP")))
  ) %>%
  dplyr::mutate(UMAP1 = V1, UMAP2 = V2) %>%
  # And let's make a UMAP!
  ggplot() +
  aes(x = UMAP1, y = UMAP2, color = celltype) + 
  geom_point(size = 0.2, alpha = 0.3)
Joining, by = c("batch", "cell_name")

0.4 Unevaluated initial exploration

Currently the code chunks in this section are not evaluated, but remain here for now for posterity.

To build an initial sense of what we can expect annotating with SingleR, let’s just look at one SCE (arbitrarily chosen as the first):

# For now let's just explore one!
sce_file <- sce_file_paths[[1]]

sce <- readr::read_rds(sce_file)
sce

The celldex package contains bulk RNA-Seq datasets for use as reference. Since this is AML data, we’ll want a reference that has a lot of blood information. According to the celldex, the Human Primary Cell Atlas data will be our closest match.

The HPCA reference consists of publicly available microarray datasets derived from human primary cells (Mabbott et al. 2013). Most of the labels refer to blood subpopulations but cell types from other tissues are also available.

# define reference with ensembl IDs to match our IDs
ref_se <- celldex::HumanPrimaryCellAtlasData(ensembl = TRUE)

# Predict cell types
#  If we change to `labels = ref_se$label.fine`, we'll get more 
#  fine-grained annotations with subtypes etc.
preds_hpca <- SingleR::SingleR(
  # dataset we want to annotate
  test = sce, 
  # reference dataset
  ref = ref_se,
  # label.main is broad 
  labels = ref_se$label.main)


# Results into a tibble:
preds_df <- tibble::as_tibble(preds_hpca$labels) %>%
  dplyr::rename(celltype = value) %>%
  dplyr::mutate(cell_barcode = rownames(preds_hpca),
                reference = "hpca")

# Save the celltypes:
hpca_celltypes <- unique(ref_se$label.main)

And a heatmap version, showing all celltypes. The bar the top shows the final assignment, which are the rows with highest scores.

SingleR::plotScoreHeatmap(preds_hpca)

But can’t hurt to see how this compares to the Blueprint/ENCODE reference:

The Blueprint/ENCODE reference consists of bulk RNA-seq data for pure stroma and immune cells generated by Blueprint (Martens and Stunnenberg 2013) and ENCODE projects (The ENCODE Project Consortium 2012).

# define reference with ensembl IDs to match our IDs
ref_se <- celldex::BlueprintEncodeData(ensembl = TRUE)

# Predict cell types, broadly
preds_blue_enc <- SingleR::SingleR(
  # dataset we want to annotate
  test = sce, 
  # reference dataset
  ref = ref_se,
  # label.main is broad 
  labels = ref_se$label.main)


# Results into a tibble:
preds_df <- preds_df %>%
  dplyr::bind_rows(
    tibble::as_tibble(preds_blue_enc$labels) %>%
    dplyr::rename(celltype = value) %>%
    dplyr::mutate(cell_barcode = rownames(preds_blue_enc),
                  reference = "blueprint_encode")
  )

# Save the celltypes:
blueprint_celltypes <- unique(ref_se$label.main)

# And the heatmap:
SingleR::plotScoreHeatmap(preds_blue_enc)

What did the references predict?

# How many of each celltype?
preds_df %>%
  dplyr::filter(reference == "hpca") %>%
  dplyr::count(celltype) %>%
  dplyr::arrange(-n)


preds_df %>%
  dplyr::filter(reference == "blueprint_encode") %>%
  dplyr::count(celltype) %>%
  dplyr::arrange(-n)

We’ll have to harmonize the celltype names between references to do a robust comparison, but from a very quick glance, overlap is thinner than one might like. That said, we don’t necessarily expect Blueprint/ENCODE to do particularly well anyways!

For a clearer comparison, we’ll harmonize predicted celltypes, but let’s just focus on exactly matching celltypes as a start -

# Let's see what we have here:
sort(hpca_celltypes)
sort(blueprint_celltypes)

# Manually compiled data rame of celltypes roughly present in BOTH references
#  for now leaving the Pre/Pro B-cells out
shared_celltypes_df <- tibble::tribble(
  ~hpca_celltype, ~blueprint_celltype,
  # Both references have it:
  "Astrocyte", "Astrocytes",
  "B_cell", "B-cells",
  "Chondrocytes","Chondrocytes",
  "DC", "DC", 
  "Endothelial_cells", "Endothelial cells", 
  "Epithelial_cells", "Epithelial cells", 
  "Fibroblasts", "Fibroblasts", 
  "Keratinocytes", "Keratinocytes", 
  "Macrophage", "Macrophages", 
  "Monocyte", "Monocytes", 
  "Neurons", "Neurons", 
  "Neutrophils", "Neutrophils", 
  "NK_cell", "NK cells", 
  "Smooth_muscle_cells", "Smooth muscle", 
  # two groups - collapse back to overall T cells
  "T_cells", "CD8+ T-cells",
  "T_cells", "CD4+ T-cells",
  # two groups of HSCs, again, collapse back to overall
  "HSC_-G-CSF", "HSC", 
  "HSC_CD34+", "HSC"
) %>%
  dplyr::mutate(harmonized_celltype = ifelse(blueprint_celltype == "HSC",
                                             blueprint_celltype, 
                                             hpca_celltype))

harmonize_celltypes <- function(preds_df, reference_name) {
  if (reference_name == "hpca") {
    shared_celltypes_colname <- rlang::sym("hpca_celltype")
  } else {
    shared_celltypes_colname <- rlang::sym("blueprint_celltype")
  }
  
  filtered_preds_df <- preds_df %>%
    dplyr::filter(reference == reference_name) 
  
  sym_reference_name <- rlang::sym(reference_name)
  filtered_preds_df %>%
    dplyr::inner_join(
      dplyr::select(
        shared_celltypes_df, 
        celltype = {{shared_celltypes_colname}},
        harmonized_celltype
      )
    ) %>% 
    dplyr::select(cell_barcode, {{sym_reference_name}} := harmonized_celltype) %>%
    dplyr::right_join(filtered_preds_df) %>%
    dplyr::mutate({{reference_name}} := dplyr::if_else(
      {{sym_reference_name}} %in% shared_celltypes_df$harmonized_celltype,
      {{sym_reference_name}},
      celltype
    )) %>% 
    dplyr::select(-celltype, -reference)
  }
  
preds_df_harmonized <- harmonize_celltypes(preds_df, "blueprint_encode") %>%
  dplyr::left_join(
    harmonize_celltypes(preds_df, "hpca")  
  )

# How often do they match?

preds_df_harmonized %>%
  dplyr::summarize(percent_same = sum(blueprint_encode == hpca) / dplyr::n() )


# Where do they not match?
preds_df_harmonized %>%
  dplyr::filter(hpca != blueprint_encode)
  
# Are there specific combinations that get differently annotated?
preds_df_harmonized %>%
  dplyr::filter(hpca != blueprint_encode) %>%
  dplyr::select(-cell_barcode) %>%
  dplyr::count(blueprint_encode, hpca) %>%
  dplyr::arrange(-n)

0.5 Session Info

sessionInfo()
R version 4.2.1 (2022-06-23)
Platform: x86_64-apple-darwin17.0 (64-bit)
Running under: macOS Monterey 12.6.1

Matrix products: default
LAPACK: /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRlapack.dylib

locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8

attached base packages:
[1] stats4    stats     graphics  grDevices datasets  utils     methods   base     

other attached packages:
 [1] ensembldb_2.20.2            AnnotationFilter_1.20.0     GenomicFeatures_1.48.3     
 [4] AnnotationDbi_1.58.0        magrittr_2.0.3              ggplot2_3.3.6              
 [7] celldex_1.6.0               SingleR_1.10.0              SingleCellExperiment_1.18.0
[10] SummarizedExperiment_1.26.1 Biobase_2.56.0              GenomicRanges_1.48.0       
[13] GenomeInfoDb_1.32.3         IRanges_2.30.0              S4Vectors_0.34.0           
[16] BiocGenerics_0.42.0         MatrixGenerics_1.8.1        matrixStats_0.62.0         

loaded via a namespace (and not attached):
  [1] colorspace_2.0-3              rjson_0.2.21                  ellipsis_0.3.2               
  [4] modeltools_0.2-23             rprojroot_2.0.3               XVector_0.36.0               
  [7] BiocNeighbors_1.14.0          rstudioapi_0.13               farver_2.1.1                 
 [10] flexmix_2.3-18                bit64_4.0.5                   interactiveDisplayBase_1.34.0
 [13] fansi_1.0.3                   xml2_1.3.3                    codetools_0.2-18             
 [16] splines_4.2.1                 sparseMatrixStats_1.8.0       cachem_1.0.6                 
 [19] knitr_1.39                    Rsamtools_2.12.0              dbplyr_2.2.1                 
 [22] png_0.1-7                     pheatmap_1.0.12               shiny_1.7.2                  
 [25] BiocManager_1.30.18           readr_2.1.2                   compiler_4.2.1               
 [28] httr_1.4.3                    lazyeval_0.2.2                assertthat_0.2.1             
 [31] Matrix_1.4-1                  fastmap_1.1.0                 cli_3.3.0                    
 [34] later_1.3.0                   BiocSingular_1.12.0           miQC_1.4.0                   
 [37] htmltools_0.5.3               prettyunits_1.1.1             tools_4.2.1                  
 [40] rsvd_1.0.5                    gtable_0.3.0                  glue_1.6.2                   
 [43] GenomeInfoDbData_1.2.8        dplyr_1.0.9                   rappdirs_0.3.3               
 [46] Rcpp_1.0.9                    vctrs_0.4.1                   Biostrings_2.64.0            
 [49] ExperimentHub_2.4.0           rtracklayer_1.56.1            DelayedMatrixStats_1.18.0    
 [52] xfun_0.32                     stringr_1.4.0                 beachmat_2.12.0              
 [55] mime_0.12                     lifecycle_1.0.1               irlba_2.3.5                  
 [58] restfulr_0.0.15               renv_0.15.5                   XML_3.99-0.10                
 [61] AnnotationHub_3.4.0           zlibbioc_1.42.0               scales_1.2.0                 
 [64] vroom_1.5.7                   ProtGenerics_1.28.0           hms_1.1.1                    
 [67] promises_1.2.0.1              parallel_4.2.1                RColorBrewer_1.1-3           
 [70] yaml_2.3.5                    curl_4.3.2                    gridExtra_2.3                
 [73] memoise_2.0.1                 biomaRt_2.52.0                stringi_1.7.8                
 [76] RSQLite_2.2.15                BiocVersion_3.15.2            BiocIO_1.6.0                 
 [79] ScaledMatrix_1.4.0            filelock_1.0.2                BiocParallel_1.30.3          
 [82] rlang_1.0.4                   pkgconfig_2.0.3               bitops_1.0-7                 
 [85] lattice_0.20-45               purrr_0.3.4                   labeling_0.4.2               
 [88] GenomicAlignments_1.32.1      bit_4.0.4                     tidyselect_1.1.2             
 [91] here_1.0.1                    R6_2.5.1                      generics_0.1.3               
 [94] DelayedArray_0.22.0           DBI_1.1.3                     pillar_1.8.0                 
 [97] withr_2.5.0                   KEGGREST_1.36.3               RCurl_1.98-1.8               
[100] nnet_7.3-17                   tibble_3.1.8                  crayon_1.5.1                 
[103] utf8_1.2.2                    BiocFileCache_2.4.0           tzdb_0.3.0                   
[106] viridis_0.6.2                 progress_1.2.2                grid_4.2.1                   
[109] blob_1.2.3                    digest_0.6.29                 xtable_1.8-4                 
[112] tidyr_1.2.0                   httpuv_1.6.5                  munsell_0.5.0                
[115] viridisLite_0.4.0            
---
params:
  seed: 2022
  library_file: !r file.path("sample-info", "scpca-processed-libraries.tsv")
  project_id: "SCPCP000007"
  reference: "hpca"
  integrated_sce_dir: !r file.path("results", "scpca", "integrated_sce")
  integration_method: "fastmnn"
  max_celltypes: 5
title: "Cell type annotation exploration"
author: "Data Lab"
date: "`r params$date`"
output:
  html_notebook:
    toc: true
    toc_depth: 3
    toc_float: true
    number_sections: true
---

This notebook explores the use of [`SingleR`](https://bioconductor.org/books/release/SingleRBook/) to perform cell-type annotation on datasets from the ScPCA project SCPCP000007 (Gawad lab data).

**Note:** The first time you run this code it may take a few more minutes due to reference downloads, but they will be cached for faster future execution.

## Set Up

```{r setup}
# load the R project
project_root <- here::here()
renv::load(project_root)

suppressPackageStartupMessages({
  library(SingleCellExperiment)
  library(SingleR)
  library(celldex)
  library(ggplot2)
})
theme_set(theme_bw())
set.seed(params$seed) # unclear if this is doing anything? probably not. but maybe later!

utils_dir <- file.path(project_root, "scripts", "utils")
source(file.path(utils_dir, "integration-helpers.R"))

sce_file_suffix <- "processed_citeseq.rds"

integrated_sce_file <- file.path(params$integrated_sce_dir, paste0(
  params$project_id, "_integrated_", params$integration_method, "_sce.rds"
))
if (!file.exists(integrated_sce_file)){
  stop("Integrated SCE file could not be found.")
}
```

Read in the data:

```{r}
library_metadata_df <- readr::read_tsv(params$library_file)

integrated_sce <- readr::read_rds(integrated_sce_file)

# Define the unintegrated SCE filenames
sce_file_paths <- library_metadata_df %>%
  dplyr::filter(project_name == params$project_id) %>%
  dplyr::mutate(sce_file_path = file.path(
    integration_input_dir, sample_biomaterial_id, glue::glue("{library_biomaterial_id}_{sce_file_suffix}")
  )) %>%
  dplyr::pull(sce_file_path)


```


## `SingleR` annotation

Here we performn celltype annotation with the given reference in `params$reference` on each of the Gawad libraries and look at their UMAPs colored by celltype.

```{r}
if (params$reference == "hpca") {
  ref_data <- celldex::HumanPrimaryCellAtlasData(ensembl = TRUE)
} else if (params$reference == "blueprint_encode") {
  ref_data <- celldex::BlueprintEncodeData(ensembl = TRUE)
} else {
  stop("Bad reference parameter; either 'hpca' or 'blueprint_encode'.")
}


annotate_SingleR <- function(sce, ref_data) {
  # return updated sce and the predictions themselves
  
  preds <- SingleR::SingleR(
    test = sce, 
    ref = ref_data,
    labels = ref_data$label.main
  )
  
  # Add `pruned.labels`, where low-confidence annotations are NA, to sce
   sce$SingleR_annotations <- preds$pruned.labels
  
   return(
     list(
       sce = sce,
       preds = preds
       )
   )
}

plot_SingleR <- function(annotation_output, colname) {
  # annotation_output: list of sce and preds
  # colname: column name to plot
  # Make a heatmap and UMAP and print out
  
  heatmap <- SingleR::plotScoreHeatmap(annotation_output[["preds"]], 
                                       # default but let's be explicit
                                       show.pruned = FALSE)

  
  # In case, for `levels()` below.
  colData(annotation_output[["sce"]])[,colname] <- as.factor(colData(annotation_output[["sce"]])[,colname])
  
  umap_df <- tibble::as_tibble(reducedDim(annotation_output[["sce"]], "UMAP")) %>%
    dplyr::select(UMAP1 = V1, UMAP2 = V2) %>%
    dplyr::mutate(celltypes = colData(annotation_output[["sce"]])[,colname])
  
  # We would probably like a more principled approach to colors that is
  #  actually based on biological information about celltypes
  plot_colors <- rainbow( length(levels(umap_df$celltypes)) )
  
  umap <- ggplot(umap_df) + 
    aes(x = UMAP1, y = UMAP2, color = celltypes) +
    geom_point(size = 0.2, alpha = 0.5) + 
    scale_color_manual(values = plot_colors) +
    guides(color = guide_legend(override.aes = list(size=3))) +
    ggtitle(glue::glue("UMAP with top {params$max_celltypes} celltypes shown"))
  
  print(heatmap)
  print(umap)
  
}

# Function to run and plot SingleR
# Return SCE with annotations `SingleR_annotations` column
run_SingleR <- function(sce, 
                        max_celltypes = params$max_celltypes, 
                        viz = TRUE) {
  # Print out the library
  print(unique( metadata(sce)$library ))
  
  # Run annotations
  anno <- annotate_SingleR(sce, ref_data)
  
  # Create an additional column `SingleR_annotations_collapsed` with only max_celltypes
  anno[["sce"]]$SingleR_annotations_collapsed <- forcats::fct_lump_n(
    anno[["sce"]]$SingleR_annotations, 
    max_celltypes
  )
  
  # plot if TRUE
  if (viz) {
    # there is no interesting color palette harmonization among library colors here!
    plot_SingleR(anno, "SingleR_annotations_collapsed")
  }
  return(anno[["sce"]])
}
```

Here we go!

```{r}
# Read in all SCE files
sce_list <- purrr::map(
  sce_file_paths, 
  readr::read_rds
)

# Annotate them all, popping out some viz along the way
sce_list_annotated <- purrr::map(sce_list, run_SingleR, viz = TRUE)

# for sanity during dev:
readr::write_rds(sce_list_annotated, "sce_list_annotated.rds")
```



Now let's plot, to start, the fastMNN UMAP but with celltype annotations from individual libraries.
As is deeply commented all over the place, we definitely need to infuse palettes with biology!

```{r}
# helper to count the number of celltypes across all libraries
count_celltypes <- function(sce) {
 tibble::tibble(celltype = colData(sce)$SingleR_annotations ) %>%
    dplyr::count(celltype) %>%
    dplyr::arrange(-n) %>%
    tidyr::drop_na() 
}

# Celltype order based on **overall** counts for all pooled annotations 
celltype_order <- purrr::map_df(sce_list_annotated, count_celltypes) %>%
  dplyr::group_by(celltype) %>%
  dplyr::summarize(celltype_counts = sum(n)) %>%
  dplyr::arrange(-celltype_counts) %>%
  dplyr::pull(celltype)


# Again we'd eventually like some biology to go into the color choices!
cell_colors <- rainbow(length(celltype_order))



# Find all the annotations for a combined color palette:
# This function is also used later for ADT extraction
extract_annotation <- function(sce, annotation_column_name) {
 tibble::tibble(batch = unique(metadata(sce)$library),
                celltype = colData(sce)[,annotation_column_name], 
                cell_barcode = rownames(colData(sce))) %>%
    dplyr::mutate(cell_name = paste(cell_barcode, batch, sep = "-"))
}


# integrated UMAP with individual library cell annotations colored
purrr::map_df(sce_list_annotated, extract_annotation, "SingleR_annotations") %>%
  dplyr::inner_join(
    tibble::as_tibble(colData(integrated_sce), rownames = "cell_name")
  ) %>%
  dplyr::bind_cols(
    as.data.frame(reducedDim(integrated_sce, paste0(params$integration_method, "_UMAP")))
  ) %>%
  dplyr::mutate(UMAP1 = V1, UMAP2 = V2) %>%
  # And let's make a UMAP!
  ggplot() +
  aes(x = UMAP1, y = UMAP2, color = celltype) + 
  geom_point(size = 0.2, alpha = 0.3) + 
  # we definitely want some biology here!
  scale_color_manual(values = cell_colors)
```



## ADTs

After annotating, we'd like to validate with ADTs.

```{r}
find_max_adt <- function(sce) {
  max_adt_indices <- apply(
    logcounts(altExp(sce)),
    2,
    which.max
  )
  # associated rownames which are the ADT names
  max_adt <- rownames(altExp(sce))[max_adt_indices]
  
  # add back to sce and return
  sce$max_adt <- max_adt
  return(sce)
}


sce_list_adt <- purrr::map(sce_list_annotated, find_max_adt)


# What are the unique ADTs for each library?
purrr::map(
  lapply(sce_list_adt, `[[`, "max_adt"), 
  unique
)

# What are the ADT distributions for each library?
purrr::map(
  lapply(sce_list_adt, `[[`, "max_adt"), 
  table
)

# What's the most common ADT distributions for each library?
most_common <- function(a_list) {
  sort(table(a_list), decreasing = TRUE)[1] %>%
    names()
}
purrr::map(
  lapply(sce_list_adt, `[[`, "max_adt"), 
  most_common
)

```

- "CD33...is a transmembrane receptor expressed on cells of myeloid lineage. It is usually considered myeloid-specific, but it can also be found on some lymphoid cells." [source](https://en.wikipedia.org/wiki/CD33)
- "CD45 is a type I transmembrane protein that is present in various isoforms on all differentiated hematopoietic cells (except erythrocytes and plasma cells)." [source](https://en.wikipedia.org/wiki/CD45)
- "The interleukin-3 receptor (CD123) is a molecule found on cells which helps transmit the signal of interleukin-3, a soluble cytokine important in the immune system....The receptor, found on pluripotent progenitor cells, induces tyrosine phosphorylation within the cell and promotes proliferation and differentiation within the hematopoietic cell lines. It can be found on basophils and pDCs as well as some cDCs among peripheral blood mononuclear cells....CD123 is expressed across acute myeloid leukemia (AML) subtypes, including leukemic stem cells." [source](https://en.wikipedia.org/wiki/Interleukin-3_receptor)


UMAP of the ADTs:

```{r}
purrr::map_df(sce_list_adt, extract_annotation, "max_adt") %>%
  dplyr::inner_join(
    tibble::as_tibble(colData(integrated_sce), rownames = "cell_name")
  ) %>%
  dplyr::bind_cols(
    as.data.frame(reducedDim(integrated_sce, paste0(params$integration_method, "_UMAP")))
  ) %>%
  dplyr::mutate(UMAP1 = V1, UMAP2 = V2) %>%
  # And let's make a UMAP!
  ggplot() +
  aes(x = UMAP1, y = UMAP2, color = celltype) + 
  geom_point(size = 0.2, alpha = 0.3)
```




## Unevaluated initial exploration

> Currently the code chunks in this section are not evaluated, but remain here for now for posterity.

To build an initial sense of what we can expect annotating with `SingleR`, let's just look at one SCE (arbitrarily chosen as the first):

```{r, eval=F}
# For now let's just explore one!
sce_file <- sce_file_paths[[1]]

sce <- readr::read_rds(sce_file)
sce
```


The `celldex` package contains bulk RNA-Seq datasets for use as reference.
Since this is AML data, we'll want a reference that has a lot of blood information.
According to the [`celldex`](https://bioconductor.org/packages/release/data/experiment/vignettes/celldex/inst/doc/userguide.html), the `Human Primary Cell Atlas` data will be our closest match.

> The HPCA reference consists of publicly available microarray datasets derived from human primary cells (Mabbott et al. 2013). Most of the labels refer to blood subpopulations but cell types from other tissues are also available.


```{r, eval=F}
# define reference with ensembl IDs to match our IDs
ref_se <- celldex::HumanPrimaryCellAtlasData(ensembl = TRUE)

# Predict cell types
#  If we change to `labels = ref_se$label.fine`, we'll get more 
#  fine-grained annotations with subtypes etc.
preds_hpca <- SingleR::SingleR(
  # dataset we want to annotate
  test = sce, 
  # reference dataset
  ref = ref_se,
  # label.main is broad 
  labels = ref_se$label.main)


# Results into a tibble:
preds_df <- tibble::as_tibble(preds_hpca$labels) %>%
  dplyr::rename(celltype = value) %>%
  dplyr::mutate(cell_barcode = rownames(preds_hpca),
                reference = "hpca")

# Save the celltypes:
hpca_celltypes <- unique(ref_se$label.main)
```

And a heatmap version, showing all celltypes. 
The bar the top shows the final assignment, which are the rows with highest scores.

```{r, eval=F, fig.width = 8, fig.height = 5}
SingleR::plotScoreHeatmap(preds_hpca)
```

But can't hurt to see how this compares to the `Blueprint/ENCODE` reference:

> The Blueprint/ENCODE reference consists of bulk RNA-seq data for pure stroma and immune cells generated by Blueprint (Martens and Stunnenberg 2013) and ENCODE projects (The ENCODE Project Consortium 2012).


```{r, eval=F, fig.width = 8, fig.height = 5}
# define reference with ensembl IDs to match our IDs
ref_se <- celldex::BlueprintEncodeData(ensembl = TRUE)

# Predict cell types, broadly
preds_blue_enc <- SingleR::SingleR(
  # dataset we want to annotate
  test = sce, 
  # reference dataset
  ref = ref_se,
  # label.main is broad 
  labels = ref_se$label.main)


# Results into a tibble:
preds_df <- preds_df %>%
  dplyr::bind_rows(
    tibble::as_tibble(preds_blue_enc$labels) %>%
    dplyr::rename(celltype = value) %>%
    dplyr::mutate(cell_barcode = rownames(preds_blue_enc),
                  reference = "blueprint_encode")
  )

# Save the celltypes:
blueprint_celltypes <- unique(ref_se$label.main)

# And the heatmap:
SingleR::plotScoreHeatmap(preds_blue_enc)
```


What did the references predict?


```{r, eval=F}
# How many of each celltype?
preds_df %>%
  dplyr::filter(reference == "hpca") %>%
  dplyr::count(celltype) %>%
  dplyr::arrange(-n)


preds_df %>%
  dplyr::filter(reference == "blueprint_encode") %>%
  dplyr::count(celltype) %>%
  dplyr::arrange(-n)
```

We'll have to harmonize the celltype names between references to do a robust comparison, but from a _very_ quick glance, overlap is thinner than one might like.
That said, we don't necessarily expect Blueprint/ENCODE to do particularly well anyways!


For a clearer comparison, we'll harmonize predicted celltypes, but let's just focus on exactly matching celltypes as a start - 
```{r, eval=F}
# Let's see what we have here:
sort(hpca_celltypes)
sort(blueprint_celltypes)

# Manually compiled data rame of celltypes roughly present in BOTH references
#  for now leaving the Pre/Pro B-cells out
shared_celltypes_df <- tibble::tribble(
  ~hpca_celltype, ~blueprint_celltype,
  # Both references have it:
  "Astrocyte", "Astrocytes",
  "B_cell", "B-cells",
  "Chondrocytes","Chondrocytes",
  "DC", "DC", 
  "Endothelial_cells", "Endothelial cells", 
  "Epithelial_cells", "Epithelial cells", 
  "Fibroblasts", "Fibroblasts", 
  "Keratinocytes", "Keratinocytes", 
  "Macrophage", "Macrophages", 
  "Monocyte", "Monocytes", 
  "Neurons", "Neurons", 
  "Neutrophils", "Neutrophils", 
  "NK_cell", "NK cells", 
  "Smooth_muscle_cells", "Smooth muscle", 
  # two groups - collapse back to overall T cells
  "T_cells", "CD8+ T-cells",
  "T_cells", "CD4+ T-cells",
  # two groups of HSCs, again, collapse back to overall
  "HSC_-G-CSF", "HSC", 
  "HSC_CD34+", "HSC"
) %>%
  dplyr::mutate(harmonized_celltype = ifelse(blueprint_celltype == "HSC",
                                             blueprint_celltype, 
                                             hpca_celltype))

harmonize_celltypes <- function(preds_df, reference_name) {
  if (reference_name == "hpca") {
    shared_celltypes_colname <- rlang::sym("hpca_celltype")
  } else {
    shared_celltypes_colname <- rlang::sym("blueprint_celltype")
  }
  
  filtered_preds_df <- preds_df %>%
    dplyr::filter(reference == reference_name) 
  
  sym_reference_name <- rlang::sym(reference_name)
  filtered_preds_df %>%
    dplyr::inner_join(
      dplyr::select(
        shared_celltypes_df, 
        celltype = {{shared_celltypes_colname}},
        harmonized_celltype
      )
    ) %>% 
    dplyr::select(cell_barcode, {{sym_reference_name}} := harmonized_celltype) %>%
    dplyr::right_join(filtered_preds_df) %>%
    dplyr::mutate({{reference_name}} := dplyr::if_else(
      {{sym_reference_name}} %in% shared_celltypes_df$harmonized_celltype,
      {{sym_reference_name}},
      celltype
    )) %>% 
    dplyr::select(-celltype, -reference)
  }
  
preds_df_harmonized <- harmonize_celltypes(preds_df, "blueprint_encode") %>%
  dplyr::left_join(
    harmonize_celltypes(preds_df, "hpca")  
  )

# How often do they match?

preds_df_harmonized %>%
  dplyr::summarize(percent_same = sum(blueprint_encode == hpca) / dplyr::n() )


# Where do they not match?
preds_df_harmonized %>%
  dplyr::filter(hpca != blueprint_encode)
  
# Are there specific combinations that get differently annotated?
preds_df_harmonized %>%
  dplyr::filter(hpca != blueprint_encode) %>%
  dplyr::select(-cell_barcode) %>%
  dplyr::count(blueprint_encode, hpca) %>%
  dplyr::arrange(-n)

```

## Session Info

```{r session_info, eval=TRUE}
sessionInfo()
```
